library("car")
library("tidyverse")
library("magrittr")
library("here")
library("janitor")
library("lubridate")
library("gridExtra")
library("readxl")
library("glmnet")
library("Lahman")
library("viridis")
library("lindia")
library("lme4")
library("caret")
library("pROC")
1. Datasets
[3 + 0 points] Create a new dataset called ‘Peopledata’ that contains all of the variables in the ‘People’ dataset by
- removing all birth information except birthYear and birthCountry and all death information, along with the variable finalGame;
People<-as_tibble(Lahman::People)
People<- People%>%
select(playerID,birthYear,birthCountry,nameFirst,nameLast,weight,height,bats,throws,debut)
ii. replacing birthCountry is by bornUSA, a logical variable indicating if the player was born in the USA;
People%>%
mutate(bornUSA = birthCountry=="USA")%>%
select(!birthCountry) -> Peopledata
[5 + 0 points] Create new datasets called Battingdata and Fieldingdata by
- choosing data from the years 1985 and 2015,
Batting<- as_tibble(Lahman::Batting)
Batting<- Batting%>%
filter(yearID == 1985 |yearID== 2015)
Fielding<- as_tibble(Lahman::Fielding)
Fielding<- Fielding%>%
filter(yearID == 1985 |yearID== 2015)
ii. selecting only those variables that for those years have fewer than 25 missing cases,
Batting%>%
sapply(function(x) sum(is.na(x)) )
playerID yearID stint teamID lgID G AB R H X2B X3B HR RBI SB CS
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
BB SO IBB HBP SH SF GIDP
0 0 0 0 0 0 0
None of the Batting dataset variables have fewer than 25 missing cases
Fielding%>%
sapply(function(x) sum(is.na(x)) )
playerID yearID stint teamID lgID POS G GS InnOuts PO A E DP PB WP
0 0 0 0 0 0 0 0 0 0 0 0 0 2995 3205
SB CS ZR
2995 2995 3205
Fielding<- Fielding%>%
select(-PB, -WP, -SB, -CS, -ZR)
iii. removing the variable 'G' from the batting dataset and removing the variables "teamID" and "lgID" from both datasets,
Fielding%>%
select(-teamID, -lgID)
Fieldingdata<- Fielding
Battingdata<- Batting%>%
select(-G, -teamID, -lgID)
iv. creating a variable in 'Battingdata' called batav which is equal to the number of hits (H) over the number of at bats (AB) if the number of hits >0, and =0 if H=0.
Battingdata<- Battingdata%>%
mutate(batav =
if_else(Battingdata$H > 0 ,Battingdata$H/Battingdata$AB,0))
[6 + 0 points] Create a dataset ‘Playerdata’ from the dataset ‘Salaries’ by
- selecting data from the years 1985 and 2015,
Playerdata<- as_tibble(Lahman::Salaries)
Playerdata<- Playerdata%>%
filter(yearID %in% c("1985","2015"))
ii. adding all distinct variables from the Fieldingdata, Battingdata and Peopledata datasets,
left_join(Playerdata, Fieldingdata)-> Playerdata
Joining, by = c("yearID", "teamID", "lgID", "playerID")
left_join(Playerdata, Battingdata)-> Playerdata
Joining, by = c("yearID", "playerID", "stint")
left_join(Playerdata, Peopledata)-> Playerdata
Joining, by = "playerID"
iii. creating a new variable 'allstar' indicating if the player appears anywhere in the AllstarFull dataset,
AllstarFull<-as_tibble(Lahman::AllstarFull)
Playerdata<- Playerdata%>%
mutate(allstar=playerID%in% AllstarFull$playerID)
iv. creating a new variable 'age' equal to each player's age in the relevant year,
Playerdata<- Playerdata%>%
mutate(age = yearID-birthYear)
iv. dropping incomplete cases from the dataset,
Playerdata<- Playerdata%>%
drop_na()
v. dropping unused levels of any categorical variable.
Playerdata<- Playerdata%>%
droplevels()
#teamID had 33 levels
#lgID had 2 levels
#bats had 3 levels
#throws had 3 levels
[4 + 0 points] Create a dataset called ‘TeamSalaries’ in which there is a row for each team and each year and the variables are:
‘Rostercost’ = the sum of all player salaries for the given team in the given year
‘meansalary’ = the mean salary for that team that year
‘rostersize’ = the number of players listed that year for that team.
as_tibble(Lahman::Salaries)%>%
group_by(teamID,yearID)%>%
summarise(Roster = sum(salary), meansalary = mean(salary), rostersize = n() )-> TeamSalaries
`summarise()` has grouped output by 'teamID'. You can override using the `.groups` argument.
#roster size check different method later
- [2 + 0 points] Create a dataset ‘Teamdata’ by taking the data from the Teams dataset for the years 1984 to 2016, inclusive and adding to that data the variables in TeamSalaries. Drop any incomplete cases from the dataset.
Teams<- as_tibble(Lahman::Teams)
Teams<- Teams%>%
filter(yearID %in% c(1984:2016))
left_join(Teams, TeamSalaries)-> Teamdata
Joining, by = c("yearID", "teamID")
Teamdata<- Teamdata%>%
drop_na()
2. Simple Linear Regression
- [2 + 2 points] Create one plot of mean team salaries over time from 1984 to 2016, and another of the log base 10 of team mean salaries over time from 1984 to 2016. Give two reasons why a linear model is more appropriate for log base 10 mean salaries than for raw mean salaries.
Teamdata%>%
ggplot(mapping=aes(yearID, meansalary))+
geom_point()+
labs(x = "Year ", y = "Mean salary") +
ggtitle("Mean salary over the years")+
geom_smooth(method = "lm", se = FALSE,colour="red")+
theme_classic()
`geom_smooth()` using formula 'y ~ x'

The above plot illustrates an upward drift of points from the left to right, where the mean salary is seen to be increasing over the years.
Teamdata%>%
ggplot(mapping=aes(yearID, log10(meansalary)))+
geom_smooth(method = "lm", se = FALSE,colour="red")+
geom_point()+
labs(x = "Year ", y = "Mean log salary") +
ggtitle("Mean log salary over the years") +
theme_classic()
`geom_smooth()` using formula 'y ~ x'

Here we can see that there is a linear relationship between log(mean salary) and yearID as illustrated by the trendline. The spread in the plot without log salary is wider whereas the plot with log salary is much more compacted. A linear model is more appropriate for log base 10 mean salaries than for raw mean salaries because it transforms the variables to be linearly related such that it makes outliers not to look like outliers. Salary often has an exponential growth and is normally highly skewed therefore a log transformation counteracts the exponential growth to make it linear and give a better representation for the overall mean salary.
- [1 + 3 points] Fit a model of \(log_{10}\)(meansalary) as a function of yearID. Write the form of the model and explain what the Multiple R-Squared tells us.
linmod_salary<- lm(log10(meansalary)~yearID, data=Teamdata)
summary(linmod_salary)
Call:
lm(formula = log10(meansalary) ~ yearID, data = Teamdata)
Residuals:
Min 1Q Median 3Q Max
-0.66345 -0.11692 0.00644 0.13394 0.55976
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -51.222416 2.310867 -22.17 <2e-16 ***
yearID 0.028711 0.001152 24.92 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1858 on 652 degrees of freedom
Multiple R-squared: 0.4878, Adjusted R-squared: 0.487
F-statistic: 620.9 on 1 and 652 DF, p-value: < 2.2e-16
linmod_salary$coefficients
(Intercept) yearID
-51.22241645 0.02871141
So the model we have built is:
\[
{\rm log(Mean salary)} \sim N(-51.222416 + 0.028711 \times {\rm yearID}, 0.1858)
\]
The model’s r-squared value is 0.4878. This means that 48.78% of the variability in the mean salary can be explained by yearID.
linmod_salary%>%
gg_diagnose(max.per.page = 1)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'







- [1 + 8 points] State and evaluate the four assumptions of linear models for this data.
linmod_salary%>%
gg_diagnose(max.per.page = 1)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'







We check if the model is reasonable by checking the following assumptions:
Linearity: here we look at the initial scatter plot of meansalary versus the yearID created before the model was built. This displayed a linear relationship between the predictor variable(yearID) and response(mean salary), so the linearity assumption is satisfied.
Homoscedasticity - scatter of residuals vs variables. In the plot of residuals vs yearID, there is a random scatter of points that is roughly the same width everywhere. This means that the spread of mean salary around the regression line is not dependent on the variable yearID. Therefore the assumption of homoscedasticity is satisfied.
Normality - here we consider the histogram of residuals. This looks roughly like a Gaussian curve and tells us the choice of the “gaussian” family of distributions is satisfied.
Normality - the normal QQplot is a straight line therefore our choice of distribution is good.
- [3 + 1 points] Plot confidence and prediction bands for this model. Colour the points according to who won the World Series each year. Comment on what you find.
Confidence band
Teamdata%>%
ggplot(mapping=aes(yearID, log10(meansalary), colour=WSWin))+
geom_smooth(method = "lm",colour='#2C3E50')+
geom_point(size=2)+
theme_classic()
`geom_smooth()` using formula 'y ~ x'

The above plot displays a grey band around the black regression line which is the confidence band, this grey band is linear. Most points in the plot are not in the grey band since the confidence band represents the uncertainty around the mean each Year, not the range of likely predicted values each Year. Our grey band is thin which suggests that the data observed isn’t likely to have come from another regression line.
Prediction band
pred1<-predict(linmod_salary,interval="prediction") #compute prediction bands
Warning in predict.lm(linmod_salary, interval = "prediction") :
predictions on current data refer to _future_ responses
my_team<-cbind(Teamdata,pred1) #add prediction bands to dataset
ggplot(my_team,aes(yearID,log10(meansalary), colour=WSWin))+
geom_point(size=2)+
geom_smooth(method=lm, color='#2C3E50')+
geom_line(aes(y=lwr), color=2,lty=2)+
geom_line(aes(y=upr), color=2,lty=2)+
theme_classic()
`geom_smooth()` using formula 'y ~ x'

The prediction band (represented by the dashed red lines) in the above plot gives us the range most “future” observations not included in the sample predictions should be in. This band will consider the uncertainty in the estimate of the mean and variance in the residuals.
- [1 + 1 points] Investigate the points that appear above the top prediction band. What team or teams do they relate to?
my_team%>%
mutate(log10(meansalary))%>%
filter(log10(meansalary)>upr) -> greater_point
greater_point<-as_tibble(greater_point)
The points that appear above the top predictio band relates to the team NYA
3. Multiple regression for Count Data
- [2 + 2 points] Create a histogram of the number of runs scored for players in the Playerdata dataset so each bar is a single value (0,1,2 runs, etc). Next create a histogram of the number of runs for all players who have had a hit. Give a domain-based and a data-based reason why it is more reasonable to create a Poisson data for the second set than the first.
Playerdata%>%
ggplot(aes(R))+
geom_histogram(binwidth = 1)+
labs(x="Number of runs",y="Frequency",title="Number of Runs by player")

Playerdata%>%
filter(H>0)%>%
ggplot(aes(R))+
geom_histogram(binwidth = 1)+
labs(x="Number of Hits",y="Frequency",title="Number of Runs by players with Hits")

It is more reasonable to create a Poisson data for the second set than the first because in the second set, we count the number of runs for players who have had a hit (H>0). In baseball, Runs are dependent on Hits, in order for a player to Run, they need to have Hit the ball. Additionally, No-hit games are relatively rare events therefore a poisson data is reasonable since poisson distributions will calculate the probability of Runs occurring in a fixed period of time if the events take place (Hits occuring) with a known constant mean rate and independently of each occurrence.
- [3 + 0 points] Create a new dataset, OnBase of all players who have had at least one hit. Transform yearID to a factor. Construct a Poisson model, glm1, of the number of runs as a function of the number of hits, the year as a factor, position played and player height and age.
OnBase<- Playerdata%>%
filter(H>0) %>%
mutate_at(vars(yearID),list(factor))
glm1<-glm(R~H+yearID+POS+height+age, data=OnBase,family="poisson")
summary(glm1)
Call:
glm(formula = R ~ H + yearID + POS + height + age, family = "poisson",
data = OnBase)
Deviance Residuals:
Min 1Q Median 3Q Max
-8.6607 -1.5962 -0.2627 1.0577 7.9257
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.465e+00 1.725e-01 14.290 < 2e-16 ***
H 1.249e-02 9.344e-05 133.625 < 2e-16 ***
yearID2015 2.382e-02 9.439e-03 2.524 0.011608 *
POS2B -1.880e-02 1.703e-02 -1.104 0.269509
POS3B 3.886e-04 1.605e-02 0.024 0.980688
POSC -7.246e-02 2.104e-02 -3.444 0.000573 ***
POSOF 5.958e-02 1.347e-02 4.425 9.65e-06 ***
POSP -1.195e+00 3.754e-02 -31.835 < 2e-16 ***
POSSS -1.400e-02 1.780e-02 -0.787 0.431573
height -2.594e-03 2.284e-03 -1.136 0.256078
age 5.119e-03 1.223e-03 4.185 2.85e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 35497.7 on 1347 degrees of freedom
Residual deviance: 5164.6 on 1337 degrees of freedom
AIC: 11544
Number of Fisher Scoring iterations: 5
- [2 + 4 points] Find the p-value for each of the predictor variables in this model using a Likelihood Ratio Test. What hypothesis does each p-value test, and what mathematically does a p-value tell you about a variable? Use this definition to say what is meant by the p-value associated to POS and to the p-value associated to height.
Anova(glm(R~H+yearID+POS+height+age, data=OnBase,family="poisson"))
Analysis of Deviance Table (Type II tests)
Response: R
LR Chisq Df Pr(>Chisq)
H 18921.1 1 < 2.2e-16 ***
yearID 6.4 1 0.01158 *
POS 1594.4 6 < 2.2e-16 ***
height 1.3 1 0.25620
age 17.5 1 2.948e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
The p-value tests the null hypothesis in deciding whether to accept or reject the null hypothesis. Mathematically, the p-value tell us the importance of a variable in a model, where is it significant or not.In the output above, the asterisks next to the p-values defines the level of significance for each predictor.
The p-value for POS is 2.2e-16 which is very small and suggest that there is overwhelming evidence that it is important for predicting the number of Runs. The smaller the p-value, the stronger the evidence for rejecting the \(H_0\). Whereas the p-value for height is 0.10994 , this is >0.05 so we do not reject the \(H_0\). This has a large p value and suggests that it is not an important predictor for the number of Runs.
- [1 + 8 points] State the assumptions of Poisson models and check these where possible.
#We check for
#var(X) < E(X) underdisperssion
#var(X) > E(X) overdisperssion
plot(glm1,which=3)
abline(h=0.8,col=3)

The plot(red line) isn’t flat and rises above 0.8 and suggests over dispersion. The red line lies above the green line. Overdispersion suggests that we have not accounted for all the important predictors in this model. We now consider other assumptions of Poisson models are similar to those for Gaussian models, and evaluate them:
Linearity: this can be evaluated by exploring the (deviance) residuals vs fitted to check for a flat line. In this case, the line looks rather bent and not flat suggesting that the relationship is not linear.
plot(glm1,which=1)

Distribution: we evaluate the qqplot for deviance residuals. In the normal QQ plot below, we see that the line lies mostly within the points suggesting that confidence intervals are satisfied.
plot(glm1,which=2)

Since our model’s dispersion assumptions is not satisfied, it may be necessary to fit another model to the data. He we consider a Quassipoisson model which assumes that the dispersion is a linear function of the mean.
glm2<-glm(R~H+yearID+POS+height+age, data=OnBase,family="quasipoisson")
summary(glm2)
Call:
glm(formula = R ~ H + yearID + POS + height + age, family = "quasipoisson",
data = OnBase)
Deviance Residuals:
Min 1Q Median 3Q Max
-8.6607 -1.5962 -0.2627 1.0577 7.9257
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.4647047 0.3299001 7.471 1.43e-13 ***
H 0.0124855 0.0001787 69.861 < 2e-16 ***
yearID2015 0.0238223 0.0180539 1.320 0.1872
POS2B -0.0188013 0.0325680 -0.577 0.5638
POS3B 0.0003886 0.0307070 0.013 0.9899
POSC -0.0724642 0.0402426 -1.801 0.0720 .
POSOF 0.0595839 0.0257556 2.313 0.0208 *
POSP -1.1949425 0.0717945 -16.644 < 2e-16 ***
POSSS -0.0139961 0.0340375 -0.411 0.6810
height -0.0025940 0.0043687 -0.594 0.5528
age 0.0051186 0.0023395 2.188 0.0288 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for quasipoisson family taken to be 3.658467)
Null deviance: 35497.7 on 1347 degrees of freedom
Residual deviance: 5164.6 on 1337 degrees of freedom
AIC: NA
Number of Fisher Scoring iterations: 5
Here the estimated dispersion parameter, 3.666552, is indeed greater than 1, which confirms a mild overdispersion. Secondly, the model coefficients are the same as for the Poisson model, the only change is the dispersion estimate.
Independence: we don’t consider this in this model
- [2 + 4 points] Now create a new model that includes teamID as a random effect. Ensure there are no fit warnings. What does the result tell us about the importance of team on number of runs that players score? Is this a relatively large or small effect? How could we check the statistical significance of this effect in R?
rand_mod<-glmer(R~H+yearID+POS+height+age+(1|teamID),data=OnBase,family="poisson", nAGQ = 0)
summary(rand_mod)
Generalized linear mixed model fit by maximum likelihood (Adaptive Gauss-Hermite Quadrature, nAGQ = 0) ['glmerMod']
Family: poisson ( log )
Formula: R ~ H + yearID + POS + height + age + (1 | teamID)
Data: OnBase
AIC BIC logLik deviance df.resid
11267.2 11329.6 -5621.6 11243.2 1336
Scaled residuals:
Min 1Q Median 3Q Max
-6.4899 -1.4226 -0.2185 1.0116 10.8328
Random effects:
Groups Name Variance Std.Dev.
teamID (Intercept) 0.008301 0.09111
Number of obs: 1348, groups: teamID, 33
Fixed effects:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.551e+00 1.785e-01 14.290 < 2e-16 ***
H 1.265e-02 9.577e-05 132.034 < 2e-16 ***
yearID2015 3.537e-02 1.027e-02 3.445 0.000572 ***
POS2B -1.494e-02 1.732e-02 -0.863 0.388211
POS3B 4.997e-03 1.616e-02 0.309 0.757158
POSC -7.078e-02 2.115e-02 -3.346 0.000820 ***
POSOF 6.331e-02 1.357e-02 4.666 3.06e-06 ***
POSP -1.159e+00 3.773e-02 -30.725 < 2e-16 ***
POSSS -1.321e-02 1.793e-02 -0.737 0.461104
height -4.373e-03 2.353e-03 -1.858 0.063155 .
age 5.459e-03 1.282e-03 4.259 2.05e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) H yID201 POS2B POS3B POSC POSOF POSP POSSS height
H -0.020
yearID2015 -0.024 0.137
POS2B -0.327 -0.003 -0.025
POS3B -0.200 0.024 0.008 0.462
POSC -0.126 0.123 0.022 0.340 0.352
POSOF -0.161 0.022 0.022 0.530 0.542 0.410
POSP 0.021 0.260 0.019 0.170 0.188 0.172 0.228
POSSS -0.259 0.004 -0.016 0.446 0.434 0.321 0.503 0.164
height -0.963 -0.095 -0.039 0.272 0.149 0.081 0.093 -0.070 0.196
age -0.241 0.197 0.091 0.123 0.066 0.038 0.095 0.055 0.154 0.007
anova(rand_mod)
Analysis of Variance Table
npar Sum Sq Mean Sq F value
H 1 22383.4 22383.4 22383.4275
yearID 1 9.8 9.8 9.7835
POS 6 1129.0 188.2 188.1703
height 1 3.6 3.6 3.5606
age 1 18.1 18.1 18.1385
\[
R = 2.617e+00+ H*1.300e-02+yearID2015*3.197e-02+\\POS2B*-9.808e-03+POS3B*9.049e-03+\\POSC*-6.275e-02+POSOF*6.560e-02+POSP*-1.138e\\+00+POSSS*-1.099e-02+height*-5.710e-03 +age* 4.783e-03
\]
normally, we take the exponential of oth sides of the eqn so here 2.617e+00+1.300e-02H + 3.197e-02yearID2015 + … so R = exp(2.617e+00)+exp(1.300e-02*H) + ….
What does the result tell us about the importance of team on number of runs that players score? Random effect -1.96\(tau\) = -1.960.0965 = -0.18914 1.96\(tau\) = 1.960.0965 = 0.18914 The interval for the random effect lies between [-0.18914, 0.18914] Zero is not included in the range for tau therefore the effect is significant.
Is this a relatively large or small effect?
Large is related to the coefficient and the unit as well range(min to max of that predictor) of predictor * coefficients e.g. for height we use 82-16 = 16coeff = 16-5.710e-03
How could we check the statistical significance of this effect in R? check how strong a random effect is on the model check the p vlaue
- [2 + 0 points] What is the mean number of runs could you expect 30-year old, 72 inch tall outfielders playing for the Baltimore Orioles in 2015 with 20 hits to have scored?
predict(rand_mod,newdata=data.frame(H=20, yearID="2015", POS="OF",height=72,age=30, teamID = "BAL", type="response") )
1
2.904727
Mean = 2.864136
4. Lasso Regression for Logistic Regression
- [4 + 0 points] Create a new dataset DivWinners by removing all of the variables that are team or park identifiers in the dataset, as well as ‘lgID’, ‘Rank’,‘franchID’,‘divID’, ‘WCWin’,‘LgWin’, and ‘WSwin’. Split the resulting into a training and a testing set so that the variable ‘DivWin’ is balanced between the two datasets. Use the seed 123.
Teamdata%>%
select(-contains("team"))%>%
select(-contains("park"))%>%
select(-lgID, -Rank, -franchID, -divID,-WCWin, -LgWin, -WSWin, -name) -> DivWinners
set.seed(123)
training.samples <- DivWinners$DivWin%>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- DivWinners [training.samples, ]
test.data <- DivWinners [-training.samples, ]
- [4 + 0 points] Use the training data to fit a logistic regression model u sing the ‘glmnet’ command. Plot residual deviance against number of predictors.
div_vector<-as.vector(train.data$DivWin)
div_predict<-model.matrix(~.-1,train.data[,-6])
div_fit<- glmnet(div_predict,div_vector, family="binomial")
plot(div_fit,xvar="dev", ylim=c(-1,1))

- [2 + 2 points] How many nonzero model coefficients are needed to explain 50% of the deviance? 60%? Which coefficients are these in each case?
div_fit
Call: glmnet(x = div_predict, y = div_vector, family = "binomial")
Df %Dev Lambda
1 0 0.00 0.244500
2 1 6.30 0.222800
3 1 11.67 0.203000
4 1 16.31 0.184900
5 2 20.45 0.168500
6 2 24.13 0.153500
7 2 27.40 0.139900
8 2 30.31 0.127500
9 2 32.92 0.116100
10 2 35.27 0.105800
11 2 37.38 0.096430
12 2 39.29 0.087860
13 2 41.02 0.080060
14 2 42.58 0.072940
15 2 44.00 0.066460
16 2 45.27 0.060560
17 2 46.43 0.055180
18 2 47.47 0.050280
19 2 48.41 0.045810
20 2 49.26 0.041740
21 2 50.02 0.038030
22 2 50.70 0.034650
23 3 51.31 0.031580
24 3 51.91 0.028770
25 3 52.45 0.026220
26 3 52.92 0.023890
27 3 53.34 0.021760
28 3 53.71 0.019830
29 3 54.04 0.018070
30 3 54.33 0.016460
31 3 54.58 0.015000
32 4 54.80 0.013670
33 6 55.04 0.012450
34 7 55.29 0.011350
35 8 55.74 0.010340
36 9 56.15 0.009421
37 9 56.51 0.008584
38 12 56.84 0.007822
39 14 57.18 0.007127
40 14 57.52 0.006494
41 15 57.81 0.005917
42 15 58.10 0.005391
43 16 58.36 0.004912
44 18 58.61 0.004476
45 19 58.84 0.004078
46 19 59.04 0.003716
47 19 59.21 0.003386
48 20 59.36 0.003085
49 21 59.49 0.002811
50 22 59.63 0.002561
51 23 59.76 0.002334
52 25 59.93 0.002126
53 26 60.13 0.001937
54 27 60.29 0.001765
55 26 60.44 0.001609
56 27 60.61 0.001466
57 27 60.80 0.001335
58 28 61.04 0.001217
59 28 61.28 0.001109
60 28 61.48 0.001010
61 27 61.62 0.000920
62 29 61.79 0.000839
63 29 61.98 0.000764
64 30 62.21 0.000696
65 31 62.42 0.000634
66 31 62.60 0.000578
67 31 62.76 0.000527
68 31 62.89 0.000480
69 31 63.00 0.000437
70 32 63.10 0.000398
71 32 63.18 0.000363
72 32 63.25 0.000331
73 32 63.30 0.000301
74 32 63.35 0.000275
75 33 63.40 0.000250
76 33 63.43 0.000228
77 34 63.46 0.000208
78 34 63.49 0.000189
79 34 63.51 0.000172
80 34 63.53 0.000157
81 34 63.55 0.000143
82 34 63.56 0.000130
83 34 63.57 0.000119
84 35 63.58 0.000108
85 35 63.59 0.000099
86 35 63.60 0.000090
87 35 63.61 0.000082
88 36 63.62 0.000075
89 36 63.65 0.000068
90 35 63.66 0.000062
91 35 63.67 0.000056
92 35 63.68 0.000051
93 36 63.69 0.000047
94 37 63.73 0.000043
95 37 63.78 0.000039
96 37 63.81 0.000035
97 37 63.81 0.000032
Nonzero coefficients = 2 And these are = “W” and “L”
div_vals_50<- coef(div_fit, s=0.038030)
div_vals_50@Dimnames[[1]][1+div_vals_50@i]
[1] "(Intercept)" "W" "L"
div_vals_60<- coef(div_fit, s=0.001937)
div_vals_60@Dimnames[[1]][1+div_vals_60@i]
[1] "(Intercept)" "yearID" "Ghome" "W" "L" "AB" "H" "X2B" "X3B"
[10] "HR" "BB" "SO" "SB" "CS" "HBP" "SF" "RA" "CG"
[19] "SV" "HA" "HRA" "BBA" "SOA" "DP" "FP" "attendance" "PPF"
[28] "rostersize"
At 60% we have 27 non-zero coefficients and these are: “yearID”,“Ghome”,“W”,“L”, “AB”, “H”, “X2B”, “X3B”, “HR”,“BB”, “SO”,“SB”, “CS” ,“HBP”,“SF”, “RA”, “CG” ,“SV”, “HA”, “HRA”, “BBA”, “SOA”, “DP”, “FP”, “attendance”, “PPF”,“rostersize”
- [2 + 1 points] Now use cross-validation to choose a moderately conservative model. State the variables you will include.
set.seed(123)
div_cv<- cv.glmnet(div_predict, div_wins, family="binomial")
plot(div_cv)

A conservative model opts for a model that is effective even with lower number of variables where we want the Mean Square Error to be as small as possible. Here, we expect to use 2-3 variables in the model as seen in the dashed interval.
Finding the new coefficients at the lowest value of lamba
div_fit_min<-coef(div_fit,s=div_cv$lambda.1se)
setdiff(div_fit_min@Dimnames[[1]][1+div_fit_min@i],div_vals_50@Dimnames[[1]][1+div_vals_50@i])
[1] "attendance"
From here, we obtain the covariant “attendance” So we need to consider which of the remaining factors to include, based on how many levels are selected by the smallest value of lambda we would consider:
When we include all the variables, the max % we get is 63.81%, so it is not necessary to include all variables in the model so it is better to include the following 3 variables.
div_fit_max<-coef(div_fit,s=div_cv$lambda.min)
div_fit_max@Dimnames[[1]][1+div_fit_max@i]
[1] "(Intercept)" "W" "L" "attendance"
- [4 + 2 points] Fit the model on the training data, then predict on the testing data. Plot comparative ROC curves and summarise your findings.
div_win_model<-glm(as.factor(DivWin)~W+L+attendance,family="binomial",data=train.data)
summary(div_win_model)
Call:
glm(formula = as.factor(DivWin) ~ W + L + attendance, family = "binomial",
data = train.data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.56928 -0.30288 -0.05868 -0.00847 2.62874
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 8.621e+00 6.996e+00 1.232 0.2179
W 9.094e-02 4.563e-02 1.993 0.0463 *
L -2.591e-01 5.138e-02 -5.042 4.6e-07 ***
attendance 4.262e-07 2.698e-07 1.580 0.1141
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 527.73 on 523 degrees of freedom
Residual deviance: 231.36 on 520 degrees of freedom
AIC: 239.36
Number of Fisher Scoring iterations: 7
div_win_model2<-glm(as.factor(DivWin)~W+L,family="binomial",data=train.data)
summary(div_win_model2)
Call:
glm(formula = as.factor(DivWin) ~ W + L, family = "binomial",
data = train.data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.70863 -0.28622 -0.05751 -0.00800 2.66915
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 7.31895 6.93618 1.055 0.2913
W 0.11019 0.04406 2.501 0.0124 *
L -0.24860 0.05063 -4.911 9.08e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 527.73 on 523 degrees of freedom
Residual deviance: 233.89 on 521 degrees of freedom
AIC: 239.89
Number of Fisher Scoring iterations: 7
To compare the 2 models: we check their ROC curves(to see the effect of an additional variable)
mod1<- (predict(div_win_model, type="response"))
mod2<- (predict(div_win_model2, type="response"))
predtrain_mod<-predict(div_win_model,type="response")
predtest_mod<-predict(div_win_model2,newdata=test.data,type="response")
rocmodels<-roc(response=train.data$DivWin,predictor=predtrain_mod,plot=TRUE,main="ROC Curve for DivWin",auc=TRUE)
Setting levels: control = N, case = Y
Setting direction: controls < cases
roc(response=test.data$DivWin,predictor=predtest_mod,plot=TRUE,auc=TRUE,add=TRUE,col=2)
Setting levels: control = N, case = Y
Setting direction: controls < cases
Call:
roc.default(response = test.data$DivWin, predictor = predtest_mod, auc = TRUE, plot = TRUE, add = TRUE, col = 2)
Data: predtest_mod in 104 controls (test.data$DivWin N) < 26 cases (test.data$DivWin Y).
Area under the curve: 0.9451
legend(0,0.4,legend=c("mod1","mod2"),fill=1:2)

As seen in the plot, the ROC curve for mod2 with an additional variable attendance, slightly performs better than than mod2 with only 2 variables L and W.
Now we predict Here we predict on the
predtrain<-predict(div_win_model,type="response")
predtest<-predict(div_win_model,newdata=test.data,type="response")
roctrain<-roc(response=train.data$DivWin,predictor=predtrain,plot=TRUE,main="ROC Curve for DivWin",auc=TRUE)
Setting levels: control = N, case = Y
Setting direction: controls < cases
roctest<- roc(response=test.data$DivWin,predictor=predtest,plot=TRUE,auc=TRUE,add=TRUE,col=2)
Setting levels: control = N, case = Y
Setting direction: controls < cases
legend(0,0.4,legend=c("training","testing"),fill=1:2)

Testing ROC performed better than the training ROC. These two curves are very similar in shape. This is a good sign that the data is not overfitted to the training data.
And finally we compare confusion matrices and sums of sensitivities.
- [4 + 2 points] Find Youden’s index for the training data and calculate confusion matrices at this cutoff for both training and testing data. Comment on the quality of the model for prediction in terms of false negative and false positive rates for the testing data.
Youden’s index For training data
youden_win_train<-coords(roctrain,"b",best.method="youden",transpose=TRUE)
youden_win_train
threshold specificity sensitivity
0.1836071 0.8468900 0.9433962
youden_win_train[2]+youden_win_train[3]
specificity
1.790286
So the best threshold is right around 0.18, and this achieves a sensitivity+specificity of 1.79 on this data, which is certainly an improvement on no model
For testing data
youden_win_test<-coords(roctest,x = 0.1836071 ,transpose=TRUE)
youden_win_test
threshold specificity sensitivity
0.1836071 0.8365385 0.9615385
youden_win_test[2]+youden_win_test[3]
specificity
1.798077
The sensitivity increased from 0.9433962 in the training data to 0.9615385 in the testing data whilst the specificity had a slight decrease from 0.8468900 to 0.8365385. Overall, there isn’t much loss of sensitivity+specificity.
Confusion matrix For training data
anewdata=train.data[is.na(train.data$W)==FALSE,]
anewdata$pred_id<-ifelse(predict(div_win_model,newdata=anewdata, type="response")>= 0.1836071 ,"y","n")
table(anewdata$pred_id,as.factor(anewdata$DivWin))
N Y
n 354 6
y 64 100
The columns here are the true values and the rows are the predictions. It identified 354 N’s correctly out of 418 and 100 Y’s correctly out of 106.
For testing data
anewdata_test=test.data[is.na(test.data$DivWin)==FALSE,]
anewdata_test$pred_id2<-ifelse(predict(div_win_model,newdata=anewdata_test, type="response")>= 0.1836071 ,"y","n")
table(anewdata_test$pred_id2,as.factor(anewdata_test$DivWin))
N Y
n 87 1
y 17 25
- [5 + 1 points] Calculate the sensitivity+specificity on the testing data as a function of divID and plot as a barchart. Is the prediction equally good for all divisions?
Teamdata%>%
select(-contains("team"))%>%
select(-contains("park"))%>%
select(-lgID, -Rank, -franchID,-WCWin, -LgWin, -WSWin, -name) -> DivWinners2
DivWinners2<- DivWinners2%>%
mutate_at(vars(divID),list(factor))
set.seed(123)
training.samples2 <- DivWinners2$divID%>%
createDataPartition(p = 0.8, list = FALSE)
train.data2 <- DivWinners2[training.samples2, ]
test.data2 <- DivWinners2 [-training.samples2, ]
#For C
roctrain_c<-roc(response=train.data2$divID=="C",predictor=predtrain,plot=TRUE,main="ROC Curve for DivWin",auc=TRUE)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
roctest_c<- roc(response=test.data2$divID=="C",predictor=predtest,plot=TRUE,auc=TRUE,add=TRUE,col=2)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls > cases
legend(0,0.4,legend=c("training","testing"),fill=1:2)

youden_id_train_c<-coords(roctrain_c,"b",best.method="youden",transpose=TRUE)
youden_id_train_c
threshold specificity sensitivity
0.01842452 0.53273810 0.59042553
youden_id_train_c[2]+youden_id_train_c[3]
specificity
1.123164
youden_id_test_c<-coords(roctest_c,x=0.01842452 ,transpose=TRUE)
youden_id_test_c
threshold specificity sensitivity
0.01842452 0.50602410 0.57446809
youden_c<- youden_id_test_c[2]+youden_id_test_c[3]
#For E
roctrain_e<-roc(response=train.data2$divID=="E",predictor=predtrain,plot=TRUE,main="ROC Curve for DivWin",auc=TRUE)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls > cases
roctest_e<- roc(response=test.data2$divID=="E",predictor=predtest,plot=TRUE,auc=TRUE,add=TRUE,col=2)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls > cases
legend(0,0.4,legend=c("training","testing"),fill=1:2)

youden_id_train_e<-coords(roctrain_e,"b",best.method="youden",transpose=TRUE)
youden_id_train_e
threshold specificity sensitivity
0.3821811 0.2356322 0.8352273
youden_id_train_e[2]+youden_id_train_e[3]
specificity
1.070859
youden_id_test_e<-coords(roctest_e,x=0.3821811 ,transpose=TRUE)
youden_id_test_e
threshold specificity sensitivity
0.3821811 0.2441860 0.7272727
youden_e<- youden_id_test_e[2]+youden_id_test_e[3]
#For W
roctrain_w<-roc(response=train.data2$divID=="W",predictor=predtrain,plot=TRUE,main="ROC Curve for DivWin",auc=TRUE)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls > cases
roctest_w<- roc(response=test.data2$divID=="W",predictor=predtest,plot=TRUE,auc=TRUE,add=TRUE,col=2)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
legend(0,0.4,legend=c("training","testing"),fill=1:2)

youden_id_train_w<-coords(roctrain_e,"b",best.method="youden",transpose=TRUE)
youden_id_train_w
threshold specificity sensitivity
0.3821811 0.2356322 0.8352273
youden_id_train_w[2]+youden_id_train_w[3]
specificity
1.070859
youden_id_test_w<-coords(roctest_w,x=0.3821811 ,transpose=TRUE)
youden_id_test_w
threshold specificity sensitivity
0.3821811 0.7582418 0.2820513
youden_w<- youden_id_test_w[2]+youden_id_test_w[3]
youden_c
specificity
1.080492
youden_e
specificity
0.9714588
youden_w
specificity
1.040293
divid_types <- c("E", "W", "C")
sens_spec <- c(youden_e,youden_w,youden_c )
bar_tab<- as_tibble(data.frame(divid_types ,sens_spec ))
ggplot(bar_tab, aes(divid_types, sens_spec ))+
geom_col()+
theme_classic()

---
title: "Coursework_MAP501_2021"
output:
  pdf_document: default
  html_notebook: default
---

```{r, message = FALSE, warning = FALSE}
library("car")
library("tidyverse")
library("magrittr")
library("here")
library("janitor")
library("lubridate")
library("gridExtra")
library("readxl")
library("glmnet")
library("Lahman")
library("viridis")
library("lindia")
library("lme4")
library("caret")
library("pROC")
```

# 1. Datasets

a.  [3 + 0 points] Create a new dataset called 'Peopledata' that contains all of the variables in the 'People' dataset by

    i. removing all birth information except birthYear and birthCountry and all death information, along with the variable finalGame;
    
```{r}
People<-as_tibble(Lahman::People)

People<- People%>%
  select(playerID,birthYear,birthCountry,nameFirst,nameLast,weight,height,bats,throws,debut)
```
 
    ii. replacing birthCountry by bornUSA, a logical variable indicating if the player was born in the USA;
    
```{r}
People%>%
  mutate(bornUSA = birthCountry=="USA")%>%
  select(!birthCountry) -> Peopledata
```
    

b.  [5 + 0 points] Create new datasets called Battingdata and Fieldingdata by 

    i. choosing data from the years 1985 and 2015,
    
```{r}
Batting<- as_tibble(Lahman::Batting)
Batting<- Batting%>%
  filter(yearID == 1985 |yearID== 2015)

Fielding<- as_tibble(Lahman::Fielding)
Fielding<- Fielding%>%
  filter(yearID == 1985 |yearID== 2015)
  
```
    
    ii. selecting only those variables that for those years have fewer than 25 missing cases, 
    
```{r}
Batting%>%
  sapply(function(x) sum(is.na(x)) )
```

All of the Batting variables have fewer than 25 missing cases for these years (in fact none have any missing values), so no variables need to be removed from Battingdata.
    
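A programmatic way of applying the "fewer than 25 missing" rule is sketched below (here every Batting column passes, so nothing is dropped):

```{r}
# Sketch: keep only the columns with fewer than 25 missing values
Batting %>%
  select(where(function(x) sum(is.na(x)) < 25)) %>%
  names()
```
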
```{r}
Fielding%>%
  sapply(function(x) sum(is.na(x)) )

# PB, WP, SB, CS and ZR each have well over 25 missing values in 1985/2015, so drop them
Fielding<- Fielding%>%
  select(-PB, -WP, -SB, -CS, -ZR)
```
    
    iii. removing the variable 'G' from the batting dataset and removing the variables "teamID" and "lgID" from both datasets, 
    
```{r}
# remove teamID and lgID and store the result as Fieldingdata
Fieldingdata<- Fielding%>%
  select(-teamID, -lgID)
```

 
```{r}
Battingdata<- Batting%>%
  select(-G, -teamID, -lgID)
```

   
    
    iv. creating a variable in 'Battingdata' called batav which is equal to the number of hits (H) over the number of at bats (AB) if the number of hits >0, and =0 if H=0.
    
```{r}
Battingdata<- Battingdata%>%
  mutate(batav = if_else(H > 0, H / AB, 0))
```

  

c.  [6 + 0 points] Create a dataset 'Playerdata' from the dataset 'Salaries' by 
    
    i. selecting data from the years 1985 and 2015, 
    
```{r}
Playerdata<- as_tibble(Lahman::Salaries)

Playerdata<- Playerdata%>%
  filter(yearID %in% c(1985, 2015))
```
    
    
    ii. adding all distinct variables from the Fieldingdata, Battingdata and Peopledata datasets,
    
```{r}
left_join(Playerdata, Fieldingdata)-> Playerdata
left_join(Playerdata, Battingdata)-> Playerdata
left_join(Playerdata, Peopledata)-> Playerdata

```

   
    iii. creating a new variable 'allstar' indicating if the player appears anywhere in the AllstarFull dataset,
    
```{r}
AllstarFull<-as_tibble(Lahman::AllstarFull)

Playerdata<- Playerdata%>%
  mutate(allstar=playerID%in% AllstarFull$playerID)

```
    
    iv. creating a new variable 'age' equal to each player's age in the relevant year,
```{r}
Playerdata<- Playerdata%>%
  mutate(age = yearID-birthYear)
```

   
    v. dropping incomplete cases from the dataset,
    
```{r}
Playerdata<- Playerdata%>%
  drop_na()
```


    vi. dropping unused levels of any categorical variable.
    
```{r}
Playerdata<- Playerdata%>%
  droplevels()

#teamID had 33 levels
#lgID had 2 levels
#bats had 3 levels
#throws had 3 levels
```
    

d.  [4 + 0 points] Create a dataset called 'TeamSalaries' in which there is a row for each team and each year and the variables are:
    
    i. 'Rostercost' = the sum of all player salaries for the given team in the given year
    
    ii. 'meansalary' = the mean salary for that team that year
    
    
    iii. 'rostersize' = the number of players listed that year for that team.


```{r}

as_tibble(Lahman::Salaries)%>%
  group_by(teamID,yearID)%>%
  summarise(Rostercost = sum(salary), meansalary = mean(salary), rostersize = n() )-> TeamSalaries
#roster size check different method later
```




e. [2 + 0 points] Create a dataset 'Teamdata' by taking the data from the Teams dataset for the years 1984 to 2016, inclusive and adding to that data the variables in TeamSalaries. Drop any incomplete cases from the dataset.
```{r}
Teams<- as_tibble(Lahman::Teams)

Teams<- Teams%>%
  filter(yearID %in% c(1984:2016))
  
left_join(Teams, TeamSalaries)-> Teamdata

Teamdata<- Teamdata%>%
  drop_na()
  
```


# 2. Simple Linear Regression

a.  [2 + 2 points] Create one plot of mean team salaries over time from 1984 to 2016, and another of the log base 10 of team mean salaries over time from 1984 to 2016. Give two reasons why a linear model is more appropriate for log base 10 mean salaries than for raw mean salaries.

```{r}
Teamdata%>%
  ggplot(mapping=aes(yearID, meansalary))+
  geom_point()+
  labs(x = "Year ", y = "Mean salary") +
  ggtitle("Mean salary over the years")+
  geom_smooth(method = "lm", se = FALSE,colour="red")+
  theme_classic()
```

The above plot shows an upward drift of points from left to right: mean team salary increases over the years, and it rises much more steeply in the later years than in the earlier ones.

```{r}
Teamdata%>%
  ggplot(mapping=aes(yearID, log10(meansalary)))+
  geom_smooth(method = "lm", se = FALSE,colour="red")+
  geom_point()+
  labs(x = "Year ", y = "Mean log salary") +
  ggtitle("Mean log salary over the years") +
  theme_classic()
```

Here we can see an approximately linear relationship between log10(mean salary) and yearID, as illustrated by the trend line, whereas the raw mean salaries curve upwards.
Two reasons why a linear model is more appropriate for log base 10 mean salaries than for raw mean salaries:
(1) salaries grow roughly exponentially over time and are strongly right-skewed, so the trend in yearID is close to linear on the log10 scale but clearly non-linear on the raw scale;
(2) the spread of raw mean salaries increases markedly over time, violating the constant-variance assumption, whereas the spread of log10(mean salary) is much more stable, so the residuals are closer to homoscedastic and extreme teams no longer behave like outliers.

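As an optional numerical check of these two reasons (a sketch, assuming Teamdata as constructed above):

```{r}
# (1) raw salaries are strongly right-skewed; logged salaries much less so
raw_skew <- mean((Teamdata$meansalary - mean(Teamdata$meansalary))^3) / sd(Teamdata$meansalary)^3
log_skew <- mean((log10(Teamdata$meansalary) - mean(log10(Teamdata$meansalary)))^3) / sd(log10(Teamdata$meansalary))^3
c(raw = raw_skew, log10 = log_skew)

# (2) the spread of raw salaries grows hugely over the years; on the log10 scale it is far more stable
Teamdata %>%
  group_by(yearID) %>%
  summarise(sd_raw = sd(meansalary), sd_log = sd(log10(meansalary))) %>%
  summarise(ratio_raw = max(sd_raw) / min(sd_raw), ratio_log = max(sd_log) / min(sd_log))
```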

b. [1 + 3 points] Fit a model of $log_{10}$(meansalary) as a function of yearID.  Write the form of the model and explain what the Multiple R-Squared tells us.

```{r}
linmod_salary<- lm(log10(meansalary)~yearID, data=Teamdata)
summary(linmod_salary)

linmod_salary$coefficients

```

So the model we have built is:

$$
\log_{10}({\rm meansalary}) \sim N\left(-51.222416 + 0.028711 \times {\rm yearID},\; \sigma^2\right), \qquad \hat\sigma = 0.1858
$$

The model's Multiple R-squared value is 0.4878. This means that 48.78% of the variability in log10(mean salary) is explained by yearID.

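As a quick check of what the Multiple R-squared is, it can be recomputed by hand as 1 − RSS/TSS (a sketch using linmod_salary above):

```{r}
# Multiple R-squared recomputed as 1 - RSS/TSS
y <- log10(Teamdata$meansalary)
1 - sum(residuals(linmod_salary)^2) / sum((y - mean(y))^2)
```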


c.  [1 + 8 points] State and evaluate the four assumptions of linear models for this data.

```{r}
linmod_salary%>%
  gg_diagnose(max.per.page = 1)
```


We check whether the model is reasonable by evaluating the four assumptions:

Linearity: the scatter plot of log10(mean salary) against yearID (and the residuals-vs-yearID panel from gg_diagnose) shows a roughly linear relationship between the predictor (yearID) and the response, so the linearity assumption is satisfied.

Homoscedasticity: in the plot of residuals vs yearID (and residuals vs fitted values) there is a random scatter of points of roughly constant width, so the spread of log10(mean salary) around the regression line does not depend on yearID. The assumption of _homoscedasticity_ is satisfied.

Normality: the histogram of residuals looks roughly like a Gaussian curve and the normal QQ plot is close to a straight line, so the choice of the "gaussian" family of distributions is reasonable.

Independence: this cannot be verified from the diagnostic plots alone. Since the same teams contribute observations in many different years, the observations are unlikely to be fully independent, so this assumption is questionable for these data.

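As an optional complement to the graphical checks, formal tests of these assumptions can be run (a sketch; the car package is already loaded):

```{r}
shapiro.test(residuals(linmod_salary))  # normality of residuals
ncvTest(linmod_salary)                  # non-constant variance (homoscedasticity), from car
durbinWatsonTest(linmod_salary)         # autocorrelation of residuals (independence), from car
```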

d.  [3 + 1 points] Plot confidence and prediction bands for this model.  Colour the points according to who won the World Series each year.  Comment on what you find.


*Confidence band*

```{r}
Teamdata%>%
  ggplot(mapping=aes(yearID, log10(meansalary), colour=WSWin))+
  geom_smooth(method = "lm",colour='#2C3E50')+
  geom_point(size=2)+
  theme_classic()
```

The above plot displays a narrow grey band around the fitted regression line: this is the confidence band for the mean. Most points in the plot are not inside the grey band, because the confidence band represents the uncertainty about the mean log10(salary) in each year, not the range of likely individual values in each year. The fact that the band is thin suggests that the regression line itself is estimated precisely.

*Prediction band*

```{r}
pred1<-predict(linmod_salary,interval="prediction") #compute prediction bands
my_team<-cbind(Teamdata,pred1)           #add prediction bands to dataset
ggplot(my_team,aes(yearID,log10(meansalary), colour=WSWin))+
        geom_point(size=2)+
         geom_smooth(method=lm, color='#2C3E50')+
         geom_line(aes(y=lwr), color=2,lty=2)+
         geom_line(aes(y=upr), color=2,lty=2)+
  theme_classic()
```

The prediction band (the dashed red lines in the above plot) gives the range within which most "future" observations, not used to fit the model, are expected to fall. It accounts for both the uncertainty in the estimated mean and the residual variance, which is why it is much wider than the confidence band.

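As a quick sanity check (a sketch using my_team from the chunk above), roughly 95% of the observed values should lie inside the 95% prediction band:

```{r}
# proportion of observations falling inside the 95% prediction band (should be roughly 0.95)
mean(log10(my_team$meansalary) >= my_team$lwr & log10(my_team$meansalary) <= my_team$upr)
```
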
 e. [1 + 1 points] Investigate the points that appear above the top prediction band.  What team or teams do they relate to?
 
```{r}
my_team%>%
  filter(log10(meansalary) > upr) %>%
  select(teamID, yearID, name, meansalary) -> greater_point

greater_point
```

The points that appear above the top prediction band all relate to the team NYA (the New York Yankees).


# 3. Multiple regression for Count Data

a. [2 + 2 points] Create a histogram of the number of runs scored for players in the Playerdata dataset so each bar is a single value (0,1,2 runs, etc).  Next create a histogram of the number of runs for all players who have had a hit. Give a domain-based and a data-based reason why it is more reasonable to create a Poisson data for the second set than the first.  

```{r}
Playerdata%>%   
  ggplot(aes(R))+
  geom_histogram(binwidth = 1)+
  labs(x="Number of runs",y="Frequency",title="Number of Runs by player")
```


```{r}
Playerdata%>%
  filter(H>0)%>%
  ggplot(aes(R))+
  geom_histogram(binwidth = 1)+
  labs(x="Number of Hits",y="Frequency",title="Number of Runs by players with Hits")
```


It is more reasonable to fit a Poisson model to the second set than to the first. Domain-based reason: runs are closely tied to hits, since a player generally has to reach base to score, and players with no hits at all (mostly pitchers and players with very few at bats) form a qualitatively different group; restricting to players with H > 0 gives a more homogeneous population for which runs behave like a count of events occurring at a roughly constant rate. Data-based reason: the histogram for all players has a very large spike at zero, far more zeros than a Poisson distribution with the observed mean would allow, whereas for players with at least one hit the counts decay much more smoothly, which is much closer to the shape of a Poisson distribution.

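A quick data-based check of this (a sketch using Playerdata as built in Section 1): compare the proportion of zeros and the mean and variance of R in the two sets; for a Poisson distribution the mean and variance should be similar and zeros should not dominate.

```{r}
# proportion of zero counts and mean/variance of R: all players vs players with at least one hit
Playerdata %>%
  summarise(prop_zero = mean(R == 0), mean_R = mean(R), var_R = var(R))
Playerdata %>%
  filter(H > 0) %>%
  summarise(prop_zero = mean(R == 0), mean_R = mean(R), var_R = var(R))
```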


b.  [3 + 0 points] Create a new dataset, OnBase of all players who have had at least one hit.  Transform yearID to a factor.  Construct a Poisson model, glm1, of the number of runs as a function of the number of hits, the year as a factor, position played and player height and age.

```{r}
OnBase<- Playerdata%>% 
  filter(H>0) %>% 
  mutate_at(vars(yearID),list(factor))

glm1<-glm(R~H+yearID+POS+height+age, data=OnBase,family="poisson")
summary(glm1)
  
```

c.  [2 + 4 points] Find the p-value for each of the predictor variables in this model using a Likelihood Ratio Test. What hypothesis does each p-value test, and what mathematically does a p-value tell you about a variable?  Use this definition to say what is meant by the p-value associated to POS and to the p-value associated to height.

```{r}
Anova(glm1)
```

Each p-value tests the null hypothesis that the corresponding term has no effect on the number of runs once the other predictors are in the model, i.e. that its coefficient(s) are zero.
Mathematically, a p-value is the probability, if that null hypothesis were true, of obtaining a likelihood-ratio statistic at least as large as the one observed; a small p-value is therefore evidence against the null hypothesis, not a direct measure of how "important" a variable is. In the output above, the asterisks next to the p-values indicate the level of significance for each predictor.
The p-value for POS is < 2.2e-16, which is extremely small: if position had no effect on runs (given the other predictors), data like ours would be essentially impossible, so there is overwhelming evidence for rejecting $H_0$ and POS is useful for predicting the number of runs. The p-value for height is 0.256: data like ours would be quite plausible if height had no effect, so we do not reject $H_0$, and there is no evidence that height is an important predictor of the number of runs.

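To make the definition concrete, the likelihood-ratio test for height can be reproduced by hand (a sketch; glm1 is the model fitted above, and glm_noheight is a hypothetical refit without height):

```{r}
# Likelihood-ratio test for height "by hand": drop height and compare deviances
glm_noheight <- glm(R ~ H + yearID + POS + age, data = OnBase, family = "poisson")
lr_stat <- deviance(glm_noheight) - deviance(glm1)
pchisq(lr_stat, df = 1, lower.tail = FALSE)  # should match the Anova() p-value for height
```
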
d. [1 + 8 points] State the assumptions of Poisson models and check these where possible.

```{r}
#We check for 
#var(X) < E(X): underdispersion
#var(X) > E(X): overdispersion

plot(glm1,which=3)
abline(h=0.8,col=3)
```


The red line in the scale-location plot isn't flat and rises above 0.8 (the green reference line), which suggests overdispersion.
Overdispersion suggests that we have not accounted for all of the important predictors in this model.
The other assumptions of Poisson models are similar to those for Gaussian models, and we evaluate them below.

Linearity: this can be evaluated from the plot of (deviance) residuals vs fitted values, where we look for a flat line. In this case the line looks rather bent rather than flat, suggesting that the relationship is not perfectly linear on the log scale.

```{r}
plot(glm1,which=1)
```

Distribution: we evaluate the QQ plot of the deviance residuals. In the normal QQ plot below, the points lie reasonably close to the reference line, suggesting that the distributional assumption (and hence confidence intervals based on it) is broadly reasonable.

```{r}
plot(glm1,which=2)
```


Since the dispersion assumption of our model is not satisfied, it may be necessary to fit another model to the data. Here we consider a quasi-Poisson model, which estimates a dispersion parameter rather than fixing it at 1, so the variance is assumed to be proportional to the mean.

```{r}
glm2<-glm(R~H+yearID+POS+height+age, data=OnBase,family="quasipoisson")
summary(glm2)
```

Here the estimated dispersion parameter (about 3.66) is indeed greater than 1, which confirms the overdispersion. The model coefficients are the same as for the Poisson model; only the standard errors (and hence the p-values) change, because they are inflated by the dispersion estimate.

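The dispersion parameter reported by the quasi-Poisson fit can also be estimated directly from the original Poisson fit (a sketch using glm1 above):

```{r}
# Pearson estimate of the dispersion parameter from the original Poisson fit
sum(residuals(glm1, type = "pearson")^2) / df.residual(glm1)
```
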
Independence: we cannot check this directly here. Players on the same team in the same year are likely to be correlated, which is one motivation for adding a team random effect in the next part.




e. [2 + 4 points] Now create a new model that includes teamID as a random effect.  Ensure there are no fit warnings. What does the result tell us about the importance of team on number of runs that players score?  Is this a relatively large or small effect?  How could we check the statistical significance of this effect in R?

```{r}
rand_mod<-glmer(R~H+yearID+POS+height+age+(1|teamID),data=OnBase,family="poisson", nAGQ = 0)
summary(rand_mod)
```


```{r}
anova(rand_mod)
```

The fitted model (using the estimates from the summary above, to three significant figures) is

$$
\begin{aligned}
\log\big(\mathbb{E}[R]\big) = {} & 2.551 + b_{\rm team} + 0.01265\,H + 0.03537\,{\rm yearID2015} \\
& - 0.01494\,{\rm POS2B} + 0.00500\,{\rm POS3B} - 0.07078\,{\rm POSC} + 0.06331\,{\rm POSOF} \\
& - 1.159\,{\rm POSP} - 0.01321\,{\rm POSSS} - 0.00437\,{\rm height} + 0.00546\,{\rm age},
\end{aligned}
\qquad b_{\rm team} \sim N(0,\, 0.0911^2)
$$

Because the model uses a log link, exponentiating both sides turns this sum into a product:
$\mathbb{E}[R] = e^{2.551} \times e^{b_{\rm team}} \times e^{0.01265\,H} \times e^{0.03537\,{\rm yearID2015}} \times \cdots$,
so each coefficient acts multiplicatively on the expected number of runs (for example, each extra hit multiplies the expected runs by about $e^{0.01265} \approx 1.013$).

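These multiplicative effects can be extracted directly from the fitted model (a sketch using rand_mod above):

```{r}
# exponentiated fixed effects: multiplicative effects on the expected number of runs
exp(fixef(rand_mod))

# approximate 95% range of the team-level multipliers, using the estimated random-effect SD
tau <- as.data.frame(VarCorr(rand_mod))$sdcor[1]
exp(c(lower = -1.96, upper = 1.96) * tau)
```
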
*What does the result tell us about the importance of team on number of runs that players score?*

The team random intercepts are modelled as $N(0, \tau^2)$ with $\hat\tau \approx 0.091$ on the log scale. Roughly 95% of team effects therefore lie within $\pm 1.96 \times 0.091 \approx \pm 0.18$, so a typical team multiplies a player's expected run count by a factor of between about $e^{-0.18} \approx 0.84$ and $e^{0.18} \approx 1.20$. Team does matter, but only modestly.

*Is this a relatively large or small effect?*

It is relatively small compared with the fixed effects: for example, being a pitcher multiplies the expected number of runs by about $e^{-1.16} \approx 0.31$, and the range of H observed in the data corresponds to a much larger multiplicative change than $\pm 20\%$. So team explains some variation in runs, but far less than hits or position.

*How could we check the statistical significance of this effect in R?*

Fit the same model with and without the random effect (glm1 vs rand_mod) and compare them with a likelihood-ratio test (for example via anova()), remembering that the test is conservative because the null value $\tau = 0$ lies on the boundary of the parameter space; a parametric bootstrap of the variance component is an alternative.

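A sketch of such a likelihood-ratio test, computed by hand from glm1 and rand_mod (the halving of the p-value is a common rough correction because $\tau = 0$ lies on the boundary):

```{r}
# Likelihood-ratio test of the team random effect (H0: tau = 0)
lr <- as.numeric(2 * (logLik(rand_mod) - logLik(glm1)))
pchisq(lr, df = 1, lower.tail = FALSE) / 2  # halved: tau = 0 sits on the boundary of its parameter space
```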

f. [2 + 0 points] What is the mean number of runs could you expect 30-year old, 72 inch tall outfielders playing for the Baltimore Orioles in 2015 with 20 hits to have scored?  

```{r}
predict(rand_mod, newdata=data.frame(H=20, yearID="2015", POS="OF", height=72, age=30, teamID="BAL"), type="response")
```



The expected mean number of runs for such a player is approximately 18 (the linear predictor is about 2.90 on the log scale, and $e^{2.90} \approx 18$).

# 4.  Lasso Regression for Logistic Regression

a. [4 + 0 points] Create a new dataset DivWinners by removing all of the variables that are team or park identifiers in the dataset, as well as 'lgID', 'Rank','franchID','divID', 'WCWin','LgWin', and 'WSwin'.
Split the resulting into a training and a testing set so that the variable 'DivWin' is balanced between the two datasets.  Use the seed 123.

```{r}
Teamdata%>%
  select(-contains("team"))%>%
  select(-contains("park"))%>%
  select(-lgID, -Rank, -franchID, -divID,-WCWin, -LgWin, -WSWin, -name) -> DivWinners 
```


```{r}
set.seed(123)
training.samples <- DivWinners$DivWin%>%
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- DivWinners [training.samples, ]
test.data <- DivWinners [-training.samples, ]
```

b.  [4 + 0 points] Use the training data to fit a logistic regression model using the 'glmnet' command.  Plot residual deviance against number of predictors.  

```{r}
div_vector<-as.vector(train.data$DivWin)
div_predict<-model.matrix(~.-1,train.data[,-6])
```

```{r}
div_fit<- glmnet(div_predict,div_vector, family="binomial")
```


```{r}
plot(div_fit,xvar="dev", ylim=c(-1,1))
```

c.  [2 + 2 points] How many nonzero model coefficients are needed to explain 50% of the deviance? 60%?  Which coefficients are these in each case?  
```{r}
div_fit
```

To explain 50% of the deviance we need only 2 nonzero coefficients, and these are "W" and "L" (the %Dev column first passes 50% at around lambda = 0.038).

```{r}
div_vals_50<- coef(div_fit, s=0.038030)
div_vals_50@Dimnames[[1]][1+div_vals_50@i]
```

```{r}
div_vals_60<- coef(div_fit, s=0.001937)
div_vals_60@Dimnames[[1]][1+div_vals_60@i]
```

At 60% we have 27 non-zero coefficients and these are:
"yearID","Ghome","W","L", "AB", "H", "X2B", "X3B", "HR","BB", "SO","SB", "CS" ,"HBP","SF", "RA", "CG" ,"SV", "HA", "HRA", "BBA", "SOA", "DP", "FP", "attendance", "PPF","rostersize" 


d.  [2 + 1 points] Now use cross-validation to choose a moderately conservative model.  State the variables you will include.

```{r}
set.seed(123)
div_cv<- cv.glmnet(div_predict, div_vector, family="binomial")
plot(div_cv)
```
A moderately conservative model uses lambda.1se: the largest penalty whose cross-validated binomial deviance is within one standard error of the minimum, so we accept a slightly worse fit in exchange for fewer variables.
From the plot, the dashed line for lambda.1se corresponds to roughly 3 nonzero coefficients, so we expect to include about 3 variables in the model.

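For reference, the two penalties chosen by the cross-validation, and the number of nonzero coefficients each selects, can be printed directly (a sketch using div_cv and div_fit above):

```{r}
c(lambda.min = div_cv$lambda.min, lambda.1se = div_cv$lambda.1se)

# number of nonzero coefficients (excluding the intercept) at each penalty
c(at_lambda.min = sum(coef(div_fit, s = div_cv$lambda.min) != 0) - 1,
  at_lambda.1se = sum(coef(div_fit, s = div_cv$lambda.1se) != 0) - 1)
```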

Finding the coefficients selected at lambda.1se (the moderately conservative choice):
```{r}
div_fit_min<-coef(div_fit,s=div_cv$lambda.1se)
setdiff(div_fit_min@Dimnames[[1]][1+div_fit_min@i],div_vals_50@Dimnames[[1]][1+div_vals_50@i])
```
From here we obtain one additional covariate, "attendance", on top of W and L.

Even when all of the variables are included, the model only explains about 63.8% of the deviance, so there is little to gain from a larger model; we therefore include the three variables W, L and attendance. These are also exactly the variables selected at lambda.min here:

```{r}
div_fit_max<-coef(div_fit,s=div_cv$lambda.min)
div_fit_max@Dimnames[[1]][1+div_fit_max@i]
```




e.  [4 + 2 points] Fit the model on the training data, then predict on the testing data.  Plot comparative ROC curves and summarise your findings.


```{r}
div_win_model<-glm(as.factor(DivWin)~W+L+attendance,family="binomial",data=train.data)
summary(div_win_model)
```

```{r}
div_win_model2<-glm(as.factor(DivWin)~W+L,family="binomial",data=train.data)
summary(div_win_model2)
```


To compare the two models we examine their ROC curves (to see the effect of the additional variable, attendance):
```{r}
mod1<- (predict(div_win_model, type="response"))
```

```{r}
mod2<- (predict(div_win_model2, type="response"))
```

```{r}
predtrain_mod<-predict(div_win_model,type="response")
predtest_mod<-predict(div_win_model2,newdata=test.data,type="response")
```


```{r}
rocmodels<-roc(response=train.data$DivWin,predictor=predtrain_mod,plot=TRUE,main="ROC Curve for DivWin",auc=TRUE)
roc(response=test.data$DivWin,predictor=predtest_mod,plot=TRUE,auc=TRUE,add=TRUE,col=2)
legend(0,0.4,legend=c("mod1","mod2"),fill=1:2)
```

As seen in the plot, the ROC curve for mod1 (which includes the additional variable attendance, plotted here on the training data) sits slightly above the curve for mod2 (W and L only, plotted on the testing data), so the extra variable gives at most a small improvement.

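For a strictly like-for-like comparison, both models' AUCs can also be computed on the same (training) data; a quick sketch using the fitted models above:

```{r}
# AUCs of both candidate models evaluated on the same (training) data
auc(roc(train.data$DivWin, predict(div_win_model, type = "response")))
auc(roc(train.data$DivWin, predict(div_win_model2, type = "response")))
```
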
*Now we predict.*
Here we use div_win_model to predict on both the training data and the testing data and compare the resulting ROC curves.
```{r}
predtrain<-predict(div_win_model,type="response")
predtest<-predict(div_win_model,newdata=test.data,type="response")
```

```{r}
roctrain<-roc(response=train.data$DivWin,predictor=predtrain,plot=TRUE,main="ROC Curve for DivWin",auc=TRUE)
roctest<- roc(response=test.data$DivWin,predictor=predtest,plot=TRUE,auc=TRUE,add=TRUE,col=2)
legend(0,0.4,legend=c("training","testing"),fill=1:2)
```

Testing ROC performed better than the training ROC. These two curves are very similar in shape.  This is a good sign that the data is not overfitted to the training data.


And finally we compare confusion matrices and sums of sensitivity and specificity.


f.  [4 + 2 points] Find Youden's index for the training data and calculate confusion matrices at this cutoff for both training and testing data.  Comment on the quality of the model for prediction in terms of false negative and false positive rates for the testing data.

*Youden's index*
*For training data*
```{r}
youden_win_train<-coords(roctrain,"b",best.method="youden",transpose=TRUE)
youden_win_train
youden_win_train[2]+youden_win_train[3]
```
So the best threshold is right around 0.18, and this achieves a sensitivity + specificity of about 1.79 on the training data, which is certainly an improvement on no model (a classifier with no discrimination gives sensitivity + specificity = 1 at any threshold).

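Youden's index chooses the threshold maximising sensitivity + specificity − 1; as a quick sketch, this can be verified directly from the ROC curve coordinates (roctrain as computed above):

```{r}
# find the Youden-optimal threshold by hand from the ROC curve coordinates
all_coords <- coords(roctrain, "all", transpose = FALSE)
all_coords[which.max(all_coords$sensitivity + all_coords$specificity - 1), ]
```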

*For testing data*
```{r}
youden_win_test<-coords(roctest,x = 0.1836071 ,transpose=TRUE)
youden_win_test
youden_win_test[2]+youden_win_test[3]
```
The sensitivity increased from 0.9433962 in the training data to 0.9615385 in the testing data whilst the specificity had a slight decrease from 
0.8468900 to 0.8365385. Overall, there isn't much loss of sensitivity+specificity. 


*Confusion matrix*
*For training data*
```{r}
anewdata=train.data[is.na(train.data$W)==FALSE,]
anewdata$pred_id<-ifelse(predict(div_win_model,newdata=anewdata, type="response")>=  0.1836071 ,"y","n")
table(anewdata$pred_id,as.factor(anewdata$DivWin))
```
The columns here are the true values and the rows are the predictions. It identified 354 N's correctly out of 418 and 100 Y's correctly out of 106. 

*For testing data*
```{r}
anewdata_test=test.data[is.na(test.data$DivWin)==FALSE,]
anewdata_test$pred_id2<-ifelse(predict(div_win_model,newdata=anewdata_test, type="response")>=  0.1836071 ,"y","n")
table(anewdata_test$pred_id2,as.factor(anewdata_test$DivWin))
```
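
From this testing confusion matrix, the false negative rate is 1/26 ≈ 4% and the false positive rate is 17/104 ≈ 16%: at this cutoff the model rarely misses a true division winner, at the cost of flagging a fair number of non-winners. A sketch computing these rates directly from the table above:

```{r}
# false positive and false negative rates on the testing data
cm <- table(anewdata_test$pred_id2, anewdata_test$DivWin)
c(false_positive_rate = cm["y", "N"] / sum(cm[, "N"]),
  false_negative_rate = cm["n", "Y"] / sum(cm[, "Y"]))
```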



g.  [5 + 1 points] Calculate the sensitivity+specificity on the testing data as a function of divID and plot as a barchart.  Is the prediction equally good for all divisions?


```{r}
Teamdata%>%
  select(-contains("team"))%>%
  select(-contains("park"))%>%
  select(-lgID, -Rank, -franchID,-WCWin, -LgWin, -WSWin, -name) -> DivWinners2

DivWinners2<- DivWinners2%>%
  mutate_at(vars(divID),list(factor))

```

```{r}
set.seed(123)
training.samples2 <- DivWinners2$divID%>%
  createDataPartition(p = 0.8, list = FALSE)
train.data2  <- DivWinners2[training.samples2, ]
test.data2 <- DivWinners2 [-training.samples2, ]
```



*For division C*
```{r}
roctrain_c<-roc(response=train.data2$divID=="C",predictor=predtrain,plot=TRUE,main="ROC Curve for DivWin",auc=TRUE)
roctest_c<- roc(response=test.data2$divID=="C",predictor=predtest,plot=TRUE,auc=TRUE,add=TRUE,col=2)
legend(0,0.4,legend=c("training","testing"),fill=1:2)
```


```{r}
youden_id_train_c<-coords(roctrain_c,"b",best.method="youden",transpose=TRUE)
youden_id_train_c
youden_id_train_c[2]+youden_id_train_c[3]
```

```{r}
youden_id_test_c<-coords(roctest_c,x=0.01842452 ,transpose=TRUE)
youden_id_test_c
youden_c<- youden_id_test_c[2]+youden_id_test_c[3]
```

*For division E*
```{r}
roctrain_e<-roc(response=train.data2$divID=="E",predictor=predtrain,plot=TRUE,main="ROC Curve for DivWin",auc=TRUE)
roctest_e<- roc(response=test.data2$divID=="E",predictor=predtest,plot=TRUE,auc=TRUE,add=TRUE,col=2)
legend(0,0.4,legend=c("training","testing"),fill=1:2)
```


```{r}
youden_id_train_e<-coords(roctrain_e,"b",best.method="youden",transpose=TRUE)
youden_id_train_e
youden_id_train_e[2]+youden_id_train_e[3]
```

```{r}
youden_id_test_e<-coords(roctest_e,x=0.3821811 ,transpose=TRUE)
youden_id_test_e
youden_e<- youden_id_test_e[2]+youden_id_test_e[3]
```


*For division W*

```{r}
roctrain_w<-roc(response=train.data2$divID=="W",predictor=predtrain,plot=TRUE,main="ROC Curve for DivWin",auc=TRUE)
roctest_w<- roc(response=test.data2$divID=="W",predictor=predtest,plot=TRUE,auc=TRUE,add=TRUE,col=2)
legend(0,0.4,legend=c("training","testing"),fill=1:2)
```


```{r}
youden_id_train_w<-coords(roctrain_w,"b",best.method="youden",transpose=TRUE)
youden_id_train_w
youden_id_train_w[2]+youden_id_train_w[3]
```

```{r}
youden_id_test_w<-coords(roctest_w, x = youden_id_train_w[1], transpose=TRUE)
youden_id_test_w
youden_w<- youden_id_test_w[2]+youden_id_test_w[3]
```

```{r}
youden_c
youden_e
youden_w
```

```{r}
divid_types <- c("E", "W", "C")
sens_spec <- c(youden_e,youden_w,youden_c  )
```

```{r}
bar_tab<- as_tibble(data.frame(divid_types ,sens_spec ))
```

```{r}
ggplot(bar_tab, aes(divid_types, sens_spec))+
  geom_col()+
  labs(x = "Division", y = "Sensitivity + specificity")+
  theme_classic()
```


*Is the prediction equally good for all divisions?*
Not necessarily: the combined sensitivity + specificity on the testing data is about 1.08 for division C but only about 0.97 (essentially no better than chance) for division E, so the prediction is not equally good for all divisions, and it is clearly worst for the East division.


